---
title: "Code for 'Are goals scored just before halftime worth more? An old soccer wisdom statistically tested.'"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, cache = FALSE)
    options(knitr.kable.NA = '')

pacman::p_load(tidyverse, stargazer,
               lfe, tables, ordinal,
               ggpubr, pander, kableExtra, MASS)

theme_set(theme_pubr())
run_ologit <- function(formula, dataset)
{
  polr(formula, data = dataset, method = "logistic", Hess = TRUE)
}
panderOptions('knitr.auto.asis', TRUE)

```

<style type="text/css">

body{ /* Normal  */
      font-size: 10px;
  }
td {  /* Table  */
  font-size: 8px;
}
h1.title {
  font-size: 38px;
  color: DarkRed;
}
h1 { /* Header 1 */
  font-size: 28px;
  color: DarkBlue;
}
h2 { /* Header 2 */
    font-size: 22px;
  color: DarkBlue;
}
h3 { /* Header 3 */
  font-size: 18px;
  font-family: "Times New Roman", Times, serif;
  color: DarkBlue;
}
code.r{ /* Code block */
    font-size: 12px;
}
pre { /* Code block - determines code spacing between lines */
    font-size: 14px;
}
</style>

```{r}
postprocess_dataset <- function(df, dataset_name)
{
  result <- df  %>%
    mutate(outcome = ordered(outcome)) %>%
    mutate_if(is_logical, as.integer) %>%
    mutate_if(is_character, as.factor)  %>%
    mutate(dataset = dataset_name) %>%
    mutate(home_goal_at_end = home_goal_min45,
           away_goal_at_end = away_goal_min45
           )
  result
}

dataset_WF_LT <- read_csv("data/dataset_WF_L_TOP.csv") %>% 
    postprocess_dataset("WF-LT")

dataset_WF_ODDS <- read_csv("data/dataset_WF_ODDS.csv") %>% 
    postprocess_dataset("WF-ODDS")

dataset_BA <- read_csv("data/dataset_BA.csv") %>% 
    postprocess_dataset("BA")
  
dataset_WF_BA <- read_csv("data/dataset_WF_BA.csv") %>%
    postprocess_dataset("WF-BA")

dataset_WF_L <- read_csv("data/dataset_WF_L.csv") %>%
    postprocess_dataset("WF-L")
  
dataset_WF_MAX93 <- read_csv("data/dataset_WF_MAX93.csv") %>% 
    postprocess_dataset("WF-MAX93")
  
  
dataset_WF <- dataset_WF_ODDS %>%
  filter(uri %in% dataset_WF_LT$uri) %>%
  filter(competition_ID != "swe-superettan")  %>%
  mutate(score_sum_first_halftime = away_score_first_halftime + home_score_first_halftime) %>%
  mutate(dataset = "WF")

LEVELS_OF_INTEREST <- dataset_WF %>% 
  filter(score_first_halftime %in% c("0:0", "0:1", "1:0", "1:1")) %>% 
  pull(goal_sequence) %>% 
  unique() 

dataset_WF_WFC <- dataset_WF %>% 
    mutate(goal_sequence = 
          goal_sequence %>% 
  fct_other(drop = setdiff(levels(goal_sequence) %>% 
                             keep(grepl("-$", .)), 
                          LEVELS_OF_INTEREST), 
            other_level = " Remaining: Home last") %>%
             fct_other(drop = setdiff(levels(goal_sequence) %>% keep(grepl("+$", .)), LEVELS_OF_INTEREST), other_level = " Remaining: Away last")) %>%
  mutate(goal_sequence = fct_relevel(goal_sequence, LEVELS_OF_INTEREST[1] %>% as.character())) %>%
  mutate(dataset = "WF-WFC")
#filter( !is.na(last_goal_age))
```


## Datasets

We consider 5 different datasets. One comes from the original PLoS paper. We construct the remaining four using the data scraped from Worldfootball.net (for goal timings and outcomes) and Betexplorer.com (for betting odds), starting from year 1993. No red card data are currently available for the Worldfootball.net dataset.

- **BA** -- the dataset from Baert and Amez original PLoS paper, downloaded from:

http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0194255

We are not using the relative strength variable since it appears to be botched in the dataset: some values are extremely large, while others are very small, which appears to be inconsistent with the small mean and std. dev. reported in the paper. Also, we are not able to estimate any FE specification on this dataset since no information on teams or competitions is available.

Worldfootball-based datasets:

- **WF-BA** -- the dataset is an imperfect approximation of **BA** based on worldfootball.net data; it uses games from the same leagues and in the same timeframe as **BA**; contains more observations than **BA**

- **WF-ODDS** -- has betting odds for each game; was obtained by fuzzy-matching BetExplorer odds data to worldfootball.net games data. The betting odds are transformed to corresponding Poisson arrival rates  *lambda_home* and *lambda_away*(using the formula from Rudi and Groenevelt paper). Stronger teams have larger values of *lambda*.

- **WF-L** -- "main set" picked by Nils. Games from select leagues (see Appendix). Betting odds not merged.
- **WF-LT** -- a subset of **WF-L** with only top-tier leagues included

- **WF-MAX93** -- all available games. Betting odds not merged.

- **WF-WFC** -- the dataset used in the first table and the plot.

Our dataset, denoted WF-WFC, includes `r nrow(dataset_WF) %>% prettyNum(big.mark = " ")` games from year `r min(dataset_WF$year)` to `r max(dataset_WF$year)` from `r n_distinct(dataset_WF$competition_ID)` top-level national leagues. The game data from WorldFootball.net were merged with the betting odds data from BetExplorer based on the date of the game and fuzzy matching of team names---with two team names considered equal if their normalized Levenshtein similarity is above 0.5.

```{r}
var_labels <- read_csv("data/var_labels.csv") %>% mutate(var_rank = row_number())

dataset_ordering <- data.frame(dataset = c("BA", "WF-BA", "WF-ODDS", "WF-L", "WF-LT", "WF-WFC", "WF-MAX93")) %>%
  mutate(dataset_rank = row_number())

datasets = bind_rows(dataset_BA, 
                     dataset_WF_BA, 
                     dataset_WF_ODDS, 
                     dataset_WF_L, 
                     dataset_WF_LT, 
                     dataset_WF_WFC,
                     dataset_WF_MAX93) %>% 
  dplyr::select(which(sapply(.,class) %in% c("numeric", "integer")), dataset) %>%
  group_by(dataset) %>%
  mutate(game_id = row_number()) %>%
  ungroup() %>%
  gather(variable_name, value, -one_of("dataset", "game_id"))  %>% 
  group_by(dataset, variable_name) %>%
  summarise(Mean = mean(value, na.rm = T), SD = sd(value, na.rm = T), N = n()) %>%
  ungroup() %>%
  inner_join(dataset_ordering) %>%
  mutate(dataset = str_c(as.character(dataset), " (N =", format(N, big.mark = ","), ")")) %>% 
  inner_join(var_labels) %>% 
  mutate(variable_name = factor(variable_name)) %>%
  mutate(variable_name = factor(variable_name) %>% fct_reorder(var_rank),
         variable_description = factor(description) %>% fct_reorder(var_rank),
         dataset = factor(dataset) %>% fct_reorder(dataset_rank)
         ) %>% 
  arrange(dataset, variable_name) %>% 
  dplyr::select(-var_rank, -dataset_rank)

```

## Summary statistics

```{r}
tabular(Heading()*variable_description*DropEmpty("", "cell") ~ Heading("Dataset")*dataset*Heading()*max*((Mean + SD)*Format(digits = 1)), datasets %>% na.omit()) %>%
  toKable()
```

## Figure 1 (manuscript)

```{r, fig.width=14, fig.height=10}

table_1 <- dataset_WF %>%
  filter(score_first_halftime %in% c("1:0", "0:0", "0:1", "2:1", "1:1","1:2")) %>%
  mutate(min_score_first_halftime = pmin(home_score_first_halftime, away_score_first_halftime),
           condition_label = case_when(
    home_last_scorer_first_halftime & home_goal_min44 ~ "1. Home, minutes 44-45",
    home_last_scorer_first_halftime & !home_goal_min44 ~ "2. Home, minutes 1-43",
    away_last_scorer_first_halftime & !away_goal_min44 ~ "3. Away, minutes 1-43",
    away_last_scorer_first_halftime & away_goal_min44 ~ "4. Away, minutes 44-45",
    TRUE ~ "5. None" 
  )) %>%
  group_by(score_first_halftime, score_diff_first_halftime, min_score_first_halftime, outcome, condition_label) %>%
  summarise(cnt = n()) %>%
  ungroup() %>%
  group_by(score_first_halftime, score_diff_first_halftime, min_score_first_halftime, condition_label) %>%
  mutate(prop = cnt/sum(cnt, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(se = sqrt(prop*(1 - prop)/cnt),
         b_low = prop - 0.67*se,
         b_high = prop + 0.67*se
         ) %>%
  group_by(score_first_halftime, outcome) %>%
  mutate(n_obs = sum(cnt)) %>%
  ungroup()

p1 <- ggplot(table_1, aes(x = outcome, y = prop, 
                          fill = condition_label, group = condition_label
                          #,width = (cnt/1000)
                          )) + 
  geom_bar(stat="identity",  position = position_dodge2()) + 
  geom_errorbar(aes(ymin = b_low, ymax= b_high), position=position_dodge2())+
  geom_text(aes(y = prop + 0.15, label = scales::percent(round(prop,3))), position=position_dodge(width=0.9), vjust=-0.25, size = 4) +
  theme_pubr() +
  facet_grid(score_diff_first_halftime ~ min_score_first_halftime) +
  scale_fill_manual(values=c("#1F78B4", "#A6CEE3", "#B2DF8A", "#33A02C", "#D5DFD5"), name = "Last scoring team (with timing)") +
  theme(
    strip.text = element_blank(),
    axis.title = element_text(size = 24, face = "bold"),
    axis.text = element_text(size = 18),
    legend.text = element_text(size = 10),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.title = element_text(size = 12, face = "bold")
    ) + 
  scale_x_discrete(name = "Outcome", expand=c(0.3,0), 
                   labels = c("Home win", "Tie", "Away win")) +
  scale_y_continuous(name = "Proportion", expand= expand_scale(add = c(0, 0.5)), breaks = c(0, 0.5, 1),
       labels = scales::percent) +
  geom_text(aes(x = 2, y = 1.25, label = score_first_halftime), size = 9) 

p1
```

## Table 1 (manuscript)

```{r, results="asis"}
cov_labels = c("1:0", "1:1, Away last", "0:1", "1:1, Home last", "Others, Home Last", "Others, Away Last",
               "Lambda, home team", "Lambda, away team",
               "Goal just before halftime",
               "1:0", "1:1, Away last", "0:1", "1:1, Home last", "Others, Home Last", "Others, Away Last"
               )
c.lm.model <- lm(formula = score_diff_final ~ goal_sequence + 
    last_goal_44:goal_sequence, data = dataset_WF_WFC)
c.lm.model.2 <- lm(formula = score_diff_final ~ goal_sequence + 
    last_goal_44:goal_sequence + score_diff_final + score_diff_first_halftime, data = dataset_WF_WFC)
c.lm.model.3 <- lm(formula = score_diff_final ~ goal_sequence + 
    last_goal_44:goal_sequence + lambda_home + lambda_away, data = dataset_WF_WFC)
c.lm.model.4 <- felm(formula = score_diff_final ~ goal_sequence + 
    last_goal_44:goal_sequence + lambda_home + lambda_away | homeTeam + awayTeam + competition_ID, data = dataset_WF_WFC)


c.glm.model <- run_ologit(outcome ~ goal_sequence + 
    last_goal_44:goal_sequence, dataset_WF_WFC)
c.glm.model.2 <- run_ologit(outcome ~ goal_sequence + 
    last_goal_44:goal_sequence + lambda_home + lambda_away, dataset_WF_WFC)
stargazer(c.lm.model, c.lm.model.2, c.lm.model.3, c.lm.model.4, c.glm.model, c.glm.model.2, 
          type = "html",
          omit.stat = c("f", "ser"),
          dep.var.labels  = c("Score difference, 2nd halftime", "Outcome"),
          #,covariate.labels = cov_labels,
          add.lines=list(c("Team and competition FE", "No", "No","No","Yes","No","No"))
          ,omit = c("Constant"),
          model.names = FALSE
)
```

## Robustness checks

- Dependent variable -- final score difference
- Statistical method: OLS regression
- Specifications (1) and (2) are the same as specifications (1) and (2) from PLoS one paper and are applied to the same dataset (dataset BA). 
- Specification (3) is the same as (1), but applied to dataset WF-BA -- our replica of dataset BA.
- Spec (4) is similar to (2), but doesn't include group phase or Europa League dummies; it is also applied to all other datasets. 
- Specs (5) and (6) are the same as (3) and (4) but estimated on the dataset WF-ODDS
- Spec (7) augments (6) with the team strength parameters estimated from betting odds 
- Specs (8), (9), (10) and (11) are the same as (3), but for WF-L, WF-LT, WFC, WF-MAX93 datasets respectively. 
    + WF-LT dataset used for spec (9) is a subset of WF-L restricted to top-tier leagues only
- Spec (12) adds team and tournament fixed effects to spec (11).
- Spec (13) adds goal sequence fixed effects to spec (11) and removes the controls which depend on goal sequence only
- Conclusion: (1) and (2) replicate the results of Baert and Amez -- the coefficient on **home_goal_min44** is positive and significant. But for all other datasets, the result is reversed: the coefficient is negative, meaning that the home team scores more in the second halftime if it scored just before the halftime.


```{r}
response_formula <- score_diff_final ~ 1
basic_predictors <- ~ . + home_goal_min44 + away_goal_min44
fm0 <- update(response_formula, basic_predictors)
fm0a <- score_diff_final ~ 1

fm1 <- update(fm0, ~ . + score_diff_first_halftime + home_score_first_halftime)
fm2 <- update(fm0, ~ . + score_diff_first_halftime + home_score_first_halftime + competition_uefa_el + competition_group_phase + home_last_scorer_first_halftime + away_last_scorer_first_halftime)
fm3 <- update(fm0, ~ . + score_diff_first_halftime + home_score_first_halftime  + home_last_scorer_first_halftime + away_last_scorer_first_halftime)
fm4 <- update(fm0, ~ . + home_last_scorer_first_halftime + away_last_scorer_first_halftime + score_diff_first_halftime + home_score_first_halftime + lambda_away + lambda_home)
fm5 <- update(fm0, ~ . + score_diff_first_halftime + home_score_first_halftime  + home_last_scorer_first_halftime + away_last_scorer_first_halftime)
fm5 <-  as.formula(str_c(paste(deparse(fm5), collapse = ""), " | homeTeam + awayTeam + competition_ID"))
fm6 <- as.formula(str_c(paste(deparse(fm0), collapse = ""), " | homeTeam + awayTeam + competition_ID + goal_sequence"))

mdl1 <- lm(fm1, data = dataset_BA)
mdl2 <- lm(fm2, data = dataset_BA)
mdl3 <- lm(fm1, data = dataset_WF_BA)
mdl4 <- lm(fm3, data = dataset_WF_BA) 
mdl5 <- lm(fm1, data = dataset_WF_ODDS)
mdl6 <- lm(fm3, data = dataset_WF_ODDS) 
mdl7 <- lm(fm4, data = dataset_WF_ODDS) 
mdl8 <- lm(fm3, data = dataset_WF_L)
mdl9 <- lm(fm3, data = dataset_WF_LT)
mdl9.1 <- lm(fm3, data = dataset_WF_WFC)
mdl10 <- lm(fm3, data = dataset_WF_MAX93)
mdl11 <- felm(fm5, data = dataset_WF_MAX93) 
mdl12 <- felm(fm6, data = dataset_WF_MAX93)
```

```{r, results='asis'}
stargazer(list(mdl1, mdl2, mdl3, mdl4, mdl5, mdl6, mdl7, mdl8, mdl9, mdl9.1, mdl10, mdl11, mdl12), 
          type = 'html',
          dep.var.caption   = "Score difference (negative for home win, positive for away win)",
          dep.var.labels.include = FALSE,
          omit.stat = c("f", "ser"),
          add.lines=list(
            c("Team FE","","","", "","","", "","","", "", "", "Yes", "Yes"),
            c("Competition FE","","","", "","","", "","","", "", "", "Yes", "Yes"),
            c("Goal sequence FE","","","", "","","", "","","", "", "", "", "Yes")
            ),
          model.names = FALSE,
          column.labels   = c("BA", "WF-BA", "WF-ODDS", "WF-L", "WF-LT", "WFC", "WF-MAX93"),
          column.separate = c(2, 2, 3, 1, 1, 1, 3)
          )
```
